pacman::p_load(tidyverse, dplyr, tidyr, readxl )

# Read cell range I8:O405 of the "project" sheet in DataSets.xlsx into `job`,
# then print the structure of the imported data.
job <- read_excel(
  path = "DataSets.xlsx",
  sheet = "project",
  range = "I8:O405"
)
str(job)

# Convert the categorical columns of `job` to factors in place.
vars <- c("rank", "discipline", "sex")
job[vars] <- purrr::map(job[vars], factor)

# Multiple linear regression of salary on rank, discipline, yrs.phd,
# yrs.service and sex (the "full" model used throughout the script).
linmod <- lm(
  salary ~ rank + discipline + yrs.phd + yrs.service + sex,
  data = job
)

# Coefficient table for the full model, requesting confidence intervals and
# robust standard errors from jtools::summ().
library(jtools)
summ(linmod, robust = TRUE, confint = TRUE)

# confint = TRUE shows confidence intervals for the regression coefficients.
# robust = TRUE reports heteroskedasticity-robust standard errors.
# Next, we check multicollinearity and heteroskedasticity using VIF values and the Breusch-Pagan (BP) test, along with diagnostic plots.

# Multicollinearity check: variance inflation factors for the full model.
library(car)
vif(mod = linmod)

# Reduced model without yrs.service and sex, followed by its VIFs.
linmod2 <- lm(salary ~ rank + discipline + yrs.phd, data = job)
vif(mod = linmod2)

# Plot yrs.service against yrs.phd with a smoother drawn underneath the points.
# Built with ggplot() rather than qplot(): qplot() is deprecated as of
# ggplot2 3.4.0, and mapping the columns through aes() gives clean axis labels
# ("yrs.phd") instead of the raw expressions ("job$yrs.phd").
ggplot(job, aes(x = yrs.phd, y = yrs.service)) +
  geom_smooth() +
  geom_point()

# Heteroskedasticity check: Breusch-Pagan test on the reduced model.
library(lmtest)
lmtest::bptest(linmod2)
# H0: no heteroskedasticity. The test gave p < 0.05, so H0 is rejected and
# we report robust standard errors for this model.
summ(model = linmod2, robust = TRUE)

# Alternative reduced model that uses yrs.service in place of yrs.phd,
# followed by the same checks: VIFs, Breusch-Pagan test, robust summary.
linmod3 <- lm(salary ~ rank + discipline + yrs.service, data = job)
vif(mod = linmod3)
bptest(linmod3)
summ(model = linmod3, robust = TRUE)


# yrs.phd and yrs.service are not significant individually, but when both are present in the model they show a significant result. Therefore, we should keep both of them.

# Robust-standard-error summary of the full model (both tenure variables kept).
summ(model = linmod, robust = TRUE)


# Diagnostic plots for the full model, one plot.lm() panel per call.
plot(linmod, which = 1)  # residuals vs fitted
plot(linmod, which = 2)  # normal Q-Q
plot(linmod, which = 3)  # scale-location
plot(linmod, which = 4)  # Cook's distance

# Salary against years of service: raw points, the default geom_smooth()
# curve, and a thick black linear (method = "lm") fit, both drawn without
# standard-error bands.
# NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for line geoms in favour of
# `linewidth`; switch once the installed ggplot2 version is confirmed.
ggplot(job, aes(x = yrs.service, y = salary)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  geom_smooth(method = "lm", se = FALSE, color = "black", size = 2)

# Variable selection by stepwise regression, starting from the full model.
# step() returns the last model it settles on, and that is the one summarised.
summ(model = stats::step(linmod))

# Fallback: if step() does not work, use stepAIC() from the MASS package.

# Observed vs predicted salary for the full model, with residuals.
#
# All three columns are derived from the fitted model rather than from `job`,
# so they always describe the same rows. lm() drops rows with missing values
# in any model variable; taking `obs` straight from job$salary would then have
# more entries than `pred`/`resid` and data.frame() would fail.
pred <- predict(linmod)

# Response values actually used in the fit. napredict() applies the model's
# own na.action so that `obs` is padded (or not) exactly like predict() and
# residuals() are.
obs <- napredict(linmod$na.action, model.response(model.frame(linmod)))

# residuals() is the accessor counterpart of predict(); unlike
# linmod$residuals it honours the model's na.action.
resid <- residuals(linmod)

compare <- data.frame(obs, pred, resid)
compare
